NON-SMALL CELL LUNG CANCER ANALYSIS

Emile Cohen

June 2020

Goal: In this notebook, we want to understand what makes Colorectal Cancer a textbook case for the patterns we saw, and what are the major subcohorts that drive the signal.


In [15]:
%run -i '../../../../../utils/setup_environment.ipy'

import warnings
warnings.filterwarnings('ignore')
from scipy.stats import fisher_exact, ranksums, chi2, norm
from statsmodels.sandbox.stats.multicomp import multipletests
import matplotlib.gridspec as gridspec
import pickle

data_path = '../../../../../data/'
data_wgd = data_path + 'impact-facets-tp53/processed/wgd/'
data_no_wgd = data_path + 'impact-facets-tp53/processed/no_wgd/'
Setup environment... done!

✅ Working on **mskimpact_env** conda environment.

Interesting functions

In [16]:
from functools import reduce

def get_hotspots(df: pd.DataFrame, Sample_Type: str, group: list = None, group_type:str = None):
    data = df[df['Sample_Type'] == Sample_Type]
    
    if group and group_type:
        data = data[data[group_type].isin(group)]

    data_1 = get_groupby(data,'tp53_spot_1', 'count'); data_2 = get_groupby(data,'tp53_spot_2', 'count'); data_3 = get_groupby(data,'tp53_spot_3', 'count') ; data_4 = get_groupby(data,'tp53_spot_4', 'count') ; data_5 = get_groupby(data,'tp53_spot_5', 'count') 
    series_data = [data_1,data_2,data_3,data_4,data_5]

    df_merged = reduce(lambda  left,right: pd.merge(left,right,left_index=True, right_index=True,
                                                how='outer'), series_data).fillna(0)

    df_merged.columns = ['count_1', 'count_2', 'count_3', 'count_4', 'count_5']
    df_merged['total'] = df_merged.sum(axis=1)
    df_merged = df_merged.sort_values(by='total', ascending=False)

    df_merged = df_merged.drop('nan')
    
    return df_merged

def get_hotspot_frac(df: pd.DataFrame, group_type:str = None, group: list = None, nb = 10):
    if group_type and group:
        df = df[df[group_type].isin(group)]
    result = [['spot', '#', 'frac']]
    for spot in get_groupby(df, 'tp53_spot_1', 'count').sort_values(by='count', ascending=False).head(nb).index.tolist():
        result.append([spot,df[df['tp53_spot_1'] == spot].frac_genome_altered.shape[0], df[df['tp53_spot_1'] == spot].frac_genome_altered.median()])

    return pd.DataFrame(result)


def boxplot_sampletype(df: pd.DataFrame, group:str, palette, order, metrics: str, figsize= (10,3), title: str = '', title_font: int=12, xlim=[0,1]):
    fig=plt.figure(figsize=figsize)
    ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)

    sns.boxplot(y=metrics, x=group,data=df,ax=ax, dodge=False,order=order, palette=palette).set_title(title, weight='bold', fontsize=title_font)
    
    groupby_ = get_groupby(df,group, 'count')
    groupby_ = groupby_.T
    for mut in mutation_list:
        if mut not in groupby_.columns:
            groupby_[mut] = 0
    groupby_ = groupby_.T
    
    labels = []
    for element in order:
        labels.append(element + '\n('+ str(groupby_.loc[element].values[0])+')')
    
    
    
    ax.set_xticklabels(labels)
    style(ax)
    ax.set_ylim(xlim)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    return fig, ax


# Let's give a look at medians and statistics

def get_statistics(df: pd.DataFrame, group:str, metrics: str, group_list: list):
    group_1 = df[df[group] == group_list[0]][metrics]
    group_2 = df[df[group] == group_list[1]][metrics]

    median_1 = group_1.median()
    median_2 = group_2.median()

    statistic, p_value = ranksums(group_1.dropna().values,group_2.dropna().values)
    
    results = [['', 'size', metrics],
               [group_list[0], group_1.shape[0], median_1],
               [group_list[1], group_2.shape[0], median_2],
               ['', 'Statistics', 'p-value'],
               ['', statistic, p_value]]


    return pd.DataFrame(results)

def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10, tp53=False):
    samples = master.Tumor_Id.tolist()
    if tp53:
        maf_filtered = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['driver'] == True]
    else:
        maf_filtered = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['driver'] == True][maf['Hugo_Symbol'] != 'TP53']
    h = pd.DataFrame(maf_filtered[['Hugo_Symbol']].groupby(['Hugo_Symbol']).size())
    h.columns = ['count']
    h = h.sort_values(by='count', ascending=False).head(head)
    
    return(h)

def create_co_drivers_table(master: pd.DataFrame, group_type:str,  group_1: str, group_2: str):
    master_group_1 = master[master[group_type] == group_1]
    co_drivers_group_1 = get_major_codrivers(master=master_group_1,
                        maf=maf_cohort_nowgd,
                       head=100)
    co_drivers_group_1['proportion_1'] = co_drivers_group_1.apply(lambda x: 100* round(x['count'] / co_drivers_group_1.sum().values[0], 4), axis=1)
    
    master_group_2 = master[master[group_type] == group_2]
    co_drivers_group_2 = get_major_codrivers(master=master_group_2,
                        maf=maf_cohort_nowgd,
                       head=100)
    co_drivers_group_2['proportion_2'] = co_drivers_group_2.apply(lambda x: 100* round(x['count'] / co_drivers_group_2.sum().values[0], 4), axis=1)
    
    co_drivers_groups = pd.merge(co_drivers_group_1, co_drivers_group_2, on='Hugo_Symbol')
    co_drivers_groups['proportion_1'] = - co_drivers_groups['proportion_1']
    
    return co_drivers_groups

Master Definition and Filtering

In [17]:
cancer = 'Non-Small Cell Lung Cancer'
In [18]:
master_no_wgd = non_wgd_load_and_cut(data_path + 'impact-facets-tp53/processed/no_wgd/master_no_wgd.pkl')
master_wgd = pd.read_pickle(data_path + 'impact-facets-tp53/processed/wgd/master_wgd.pkl')

master_no_wgd_cancer = master_no_wgd[master_no_wgd['Cancer_Type'] == cancer]
master_wgd_cancer = master_wgd[master_wgd['Cancer_Type'] == cancer]

maf_cohort_nowgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/no_wgd/maf_cohort_nowgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
maf_cohort_wgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/wgd/maf_cohort_wgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
In [19]:
master_wgd[master_wgd['tp53_count'] >=1]
Out[19]:
Sample_Id Tumor_Id Patient_Id Cancer_Type Cancer_Type_Detailed Patient_Current_Age Sex Sample_Type purity ploidy Overall_Survival_Months Overall_Survival_Status MSI_Score MSI_Type TMB_Score tp53_key_1 tp53_vc_1 tp53_ccf_1 tp53_vaf_1 tp53_HGVSp_1 tp53_spot_1 tp53_vt_1 tp53_key_2 tp53_vc_2 tp53_ccf_2 tp53_vaf_2 tp53_HGVSp_2 tp53_spot_2 tp53_vt_2 tp53_key_3 tp53_vc_3 tp53_ccf_3 tp53_vaf_3 tp53_HGVSp_3 tp53_spot_3 tp53_vt_3 tp53_key_4 tp53_vc_4 tp53_ccf_4 tp53_vaf_4 tp53_HGVSp_4 tp53_spot_4 tp53_vt_4 tp53_key_5 tp53_vc_5 tp53_ccf_5 tp53_vaf_5 tp53_HGVSp_5 tp53_spot_5 tp53_vt_5 tp53_count tp53_tcn tp53_mcn tp53_lcn tp53_seg_length tp53_cn_state tp53_cf wgd gene_count mutation_count driver_gene_count driver_mutation_count snv_driver_mutation_count indel_driver_mutation_count max_vaf tp53_exp_nb_1 tp53_exp_nb_2 tp53_exp_nb_3 tp53_exp_nb_4 tp53_exp_nb_5 tp53_res_1 tp53_res_2 tp53_res_3 tp53_res_4 tp53_res_5 tp53_vc_group_1 tp53_vc_group_2 tp53_vc_group_3 tp53_vc_group_4 tp53_vc_group_5 tp53_vt_group_1 tp53_vt_group_2 tp53_vt_group_3 tp53_vt_group_4 tp53_vt_group_5 tp53_bi_state tp53_state_wgd tp53_loh_status tp53_mut_loh_group prewgd_tp53_group_1 prewgd_tp53_group_2 chr_affected chr_loss chr_gain chr_cnloh frac_genome_altered
1 P-0036909-T01-IM6_P-0036909-N01-IM6 P-0036909-T01-IM6 P-0036909 Non-Small Cell Lung Cancer Lung Adenocarcinoma 47.0 Female Metastasis 0.391316 2.871793 14.137 LIVING 0.37 Stable 3.5 P-0036909-T01-IM6-17_7577121_G_A Missense_Mutation 0.798 0.312169 p.Arg273Cys 273 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 2 2.0 0.0 18101929 LOSS BEFORE 0.306121 True 2 3 1 2 2 0 0.312169 1.595485 NaN NaN NaN NaN 0.404515 NaN NaN NaN NaN 273 None None None None SNV None None None None uncertain uncertain_bi True 1_loh None None 19 19 1 2 0.839
2 P-0023546-T01-IM6_P-0023546-N01-IM6 P-0023546-T01-IM6 P-0023546 Prostate Cancer Prostate Neuroendocrine Carcinoma 50.0 Male Primary 0.865628 3.115253 4.800 DECEASED 2.37 Stable 3.5 P-0023546-T01-IM6-17_7578442_T_C Missense_Mutation 0.933 0.845070 p.Tyr163Cys 163 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 3 3.0 0.0 25250470 CNLOH BEFORE & LOSS 0.835504 True 3 3 1 1 1 0 0.845070 2.797574 NaN NaN NaN NaN 0.202426 NaN NaN NaN NaN missense None None None None SNV None None None None bi bi True 1_loh bi bi 18 13 4 4 0.836
3 P-0023546-T02-IM6_P-0023546-N01-IM6 P-0023546-T02-IM6 P-0023546 Prostate Cancer Prostate Adenocarcinoma 50.0 Male Primary 0.312907 3.136841 4.800 DECEASED 0.82 Stable 2.6 P-0023546-T02-IM6-17_7578442_T_C Missense_Mutation 1.000 0.636735 p.Tyr163Cys 163 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 2 2.0 0.0 25237770 LOSS BEFORE 0.312907 True 2 2 1 1 1 0 0.636735 4.069805 NaN NaN NaN NaN -2.069805 NaN NaN NaN NaN missense None None None None SNV None None None None bi bi True 1_loh bi bi 18 13 5 1 0.775
4 P-0018837-T01-IM6_P-0018837-N01-IM6 P-0018837-T01-IM6 P-0018837 Colorectal Cancer Colon Adenocarcinoma 60.0 Male Primary 0.351778 3.727190 34.060 LIVING 0.45 Stable 5.3 P-0018837-T01-IM6-17_7578406_C_T Missense_Mutation 1.000 0.325843 p.Arg175His 175 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 3 3.0 0.0 25231975 CNLOH BEFORE & LOSS 0.257400 True 5 5 2 2 0 2 0.468198 2.178392 NaN NaN NaN NaN 0.821608 NaN NaN NaN NaN 175 None None None None SNV None None None None no_bi uncertain_mono True 1_loh tp53_res None 18 10 8 4 0.650
6 P-0019444-T01-IM6_P-0019444-N01-IM6 P-0019444-T01-IM6 P-0019444 Non-Small Cell Lung Cancer Lung Adenocarcinoma 60.0 Male Primary 0.292235 2.929398 31.068 LIVING 0.30 Stable 36.9 P-0019444-T01-IM6-17_7578461_C_A Missense_Mutation 1.000 0.169002 p.Val157Phe 157 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 3 2.0 1.0 52853241 LOSS AFTER 0.292235 True 38 41 5 5 4 1 0.311848 1.325617 NaN NaN NaN NaN 1.674383 NaN NaN NaN NaN missense None None None None SNV None None None None no_bi uncertain_mono False 1_noloh tp53_res None 22 21 0 4 0.967
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6359 P-0050468-T01-IM6_P-0050468-N01-IM6 P-0050468-T01-IM6 P-0050468 Cancer of Unknown Primary Cancer of Unknown Primary 71.0 Male Metastasis 0.281318 3.045449 1.874 DECEASED 0.06 Stable 3.5 P-0050468-T01-IM6-17_7574018_G_A Missense_Mutation 0.964 0.271255 p.Arg337Cys 337 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 2 2.0 0.0 41297581 LOSS BEFORE 0.270982 True 3 3 2 2 1 1 0.359589 1.928458 NaN NaN NaN NaN 0.071542 NaN NaN NaN NaN missense None None None None SNV None None None None bi bi True 1_loh bi bi 19 19 0 2 0.869
6360 P-0050674-T01-IM6_P-0050674-N01-IM6 P-0050674-T01-IM6 P-0050674 Pancreatic Cancer Pancreatic Adenocarcinoma 77.0 Male Metastasis 0.295521 4.477179 1.775 LIVING 0.27 Stable 2.6 P-0050674-T01-IM6-17_7577570_C_T Missense_Mutation 0.913 0.269811 p.Met237Ile 237 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 2 2.0 0.0 21421136 LOSS BEFORE 0.262533 True 2 2 2 2 2 0 0.437500 1.826004 NaN NaN NaN NaN 0.173996 NaN NaN NaN NaN missense None None None None SNV None None None None bi bi True 1_loh bi bi 15 10 6 1 0.769
6361 P-0050669-T01-IM6_P-0050669-N01-IM6 P-0050669-T01-IM6 P-0050669 Ovarian Cancer High-Grade Serous Fallopian Tube Cancer 41.0 Female Primary 0.637791 3.053874 1.775 LIVING 1.41 Stable 4.4 P-0050669-T01-IM6-17_7579414_C_T Nonsense_Mutation 1.000 0.693844 p.Trp91Ter 91 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 2 2.0 0.0 78823581 LOSS BEFORE 0.637791 True 4 4 1 1 0 1 0.693844 2.175771 NaN NaN NaN NaN -0.175771 NaN NaN NaN NaN truncated None None None None SNV None None None None bi bi True 1_loh bi bi 15 14 2 1 0.904
6363 P-0009110-T02-IM6_P-0009110-N01-IM6 P-0009110-T02-IM6 P-0009110 Non-Small Cell Lung Cancer Lung Adenocarcinoma 57.0 Female Metastasis 0.239690 3.480349 49.414 LIVING 0.00 Stable 30.7 P-0009110-T02-IM6-17_7576853_C_A Missense_Mutation 1.000 0.325806 p.Gln331His 331 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 4 4.0 0.0 7438495 CNLOH BEFORE 0.239690 True 28 33 6 6 5 1 0.325806 3.370179 NaN NaN NaN NaN 0.629821 NaN NaN NaN NaN missense None None None None SNV None None None None bi bi True 1_loh bi bi 21 15 4 3 0.871
6365 P-0050675-T01-IM6_P-0050675-N01-IM6 P-0050675-T01-IM6 P-0050675 Cancer of Unknown Primary Cancer of Unknown Primary 64.0 Female Primary 0.386871 2.760248 1.973 LIVING 1.32 Stable 23.7 P-0050675-T01-IM6-17_7579394_G_C Missense_Mutation 1.000 0.422790 p.Pro98Arg 98 SNP None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None None None NaN NaN None None None 1 2 2.0 0.0 21375632 LOSS BEFORE 0.376084 True 24 26 3 3 2 1 0.481805 2.185693 NaN NaN NaN NaN -0.185693 NaN NaN NaN NaN missense None None None None SNV None None None None bi bi True 1_loh bi bi 18 17 1 4 0.816

4228 rows × 96 columns

In [13]:
get_groupby(master_wgd[master_wgd['tp53_count'] >=1],'tp53_cn_state', 'count')[:4].sum()
Out[13]:
count    1030
dtype: int64

What makes Colorectal Cancer a TextBook Case?

WGD Proportion

Non-Small Cell Lung Cancer is one of the biggest cancer types in our cohort. It is the fourth cancer in term of WGD proportion with an average proportion of WGD - around 42%

Cancer Panel

  • NSCLC is the third most represented cancer in MSK-Impact Cohort.
  • Slightly enriched for Primary Samples
  • In comparison with other cancer types, Non-Small Cell Lung Cancer is not particularly enriched nor depleted for a tp53 subgroup.

Genome Instability

Non-Small Cell Lung Cancer Cancer shows a significant difference in Genome Instability between TP53 Mono-Allelic and Bi-Allelic subgroups - and has a lot of samples in both groups.

In the TP53 subgroup Pan Cancer plot that follows, we can see 3 important signals:

  • The GI difference between 0_HETLOSS and 1_WILD_TYPE, two likely mono-allelic subgroups
  • 1_WILD_TYPE GI is very low compared to other subgroups
  • Bi Allelic Subgroups - >=1_LOSS and >=1_cnLOH - have higher GI than other subgroups and the difference is significant

WGD Part

Subgroup Proportion

2 main key points:

  • No enrichment in Pre-WGD TP53 Bi-Allelic for NSCLC
  • Enrichment in tp53 mutation and TP53 LOH samples

Very High Genome Instability

Non WGD Part - Cancer Investigation

In this section, our goal is to find subcohorts that lead the signals observed. Here are the different subcohort we will create:

  • Hotspot Analysis: splitting on 273 / 248 / 175 / Missense / Truncated / In Frame
  • CCF Analysis
  • SNV/INDEL Analysis

Hotspot Analysis

In this section, we cut our cohort to only keep samples with exactly one TP53 mutation, for simplicity.

In [8]:
master_hotspot = master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] == 1]
In [9]:
get_hotspot_frac(df=master_hotspot,
                group_type=None,
                group=None)
Out[9]:
0 1 2
0 spot # frac
1 nan 61 0.373
2 248 26 0.409
3 273 20 0.3145
4 158 19 0.301
5 245 15 0.377
6 175 15 0.39
7 157 13 0.427
8 179 12 0.479
9 280 10 0.317
10 213 9 0.208

Entire Cohort

In [13]:
h = get_groupby(master_hotspot,'tp53_vc_group_1', 'count').sort_values(by='count', ascending=False)
display(h)

h = h.T
h = h[mutation_list]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.2, 0.5), fontsize=11)
ax.set_title('Mutation Type - {} - No WGD'.format(cancer), weight='bold', fontsize=18)

plt.show()
count
tp53_vc_group_1
missense 296
truncated 202
hotspot 30
248 26
273 20
175 14
in_frame 10
  • NSCLC has a very small proportion of hotspots.
In [10]:
fig, ax = boxplot_sampletype(df=master_hotspot,
                  group='tp53_vc_group_1',
                  palette=mutation_palette,
                  order=mutation_list,
                  metrics='frac_genome_altered',
                  figsize=(6,10),
                  title='Fraction of Genome Altered - {}'.format(cancer),
                  xlim=[0,1])
plt.show()
  • Once more, it seems that 273 hotspot leads to lower GI

TP53 Residual Subgroups

In [34]:
print('Number of Bi Allelic samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'no_tp53_res'].shape[0]))
print('')
print('Number of  TP53 Residual samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'tp53_res'].shape[0]))
Number of Bi Allelic samples (with 1 mut): 497

Number of  TP53 Residual samples (with 1 mut): 62
In [41]:
total_df = []
for group in ['tp53_res', 'no_tp53_res']:
    h = get_groupby(master_hotspot[master_hotspot['tp53_res_group'] == group], 'tp53_vc_group_1', group).sort_values(by=group, ascending=False)
    total_df.append(h)
    
    h=h.T
    
    for mut in mutation_list:
        if mut not in h.columns:
            h[mut] = 0

    h = h[mutation_list]
    fig = plt.figure(figsize=(6,1))
    ax = plt.subplot()

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
    if group == 'tp53_res':
        ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.05, 0.5), fontsize=11)
    else: ax.get_legend().remove()
    ax.set_title('Mutation Type - {} - No WGD'.format(group), weight='bold', fontsize=18)

    plt.show()

display_side_by_side(total_df[0],total_df[1])
tp53_res
tp53_vc_group_1
missense 32
truncated 17
248 5
273 5
hotspot 3
no_tp53_res
tp53_vc_group_1
missense 243
truncated 172
hotspot 25
248 19
175 14
273 14
in_frame 10
In [44]:
for group in ['tp53_res', 'no_tp53_res']:
    master_wt = master_hotspot[master_hotspot['tp53_res_group'] == group]

    fig, ax = boxplot_sampletype(df=master_wt,
                      group='tp53_vc_group_1',
                      palette=mutation_palette,
                      order=mutation_list,
                      metrics='frac_genome_altered',
                      figsize=(6,10),
                      title='Fraction of Genome Altered - No WGD - {} subgroup'.format(group),
                      xlim=[0,1])
    plt.show()

SNV / INDEL Analysis

In this section we compare SNV and INDEL mutations. As in the previous section, we cut the cohort to keep only samples with exactly 1 tp53 mutation.

# of Drivers / SNV Drivers / INDEL Drivers

In [14]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='driver_mutation_count',
                  figsize=(8,12),
                  title='Driver Mutation Count - TP53 Subroups - No WGD',
                  xlim=[-0.1,10])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size driver_mutation_count
1 1_WILD_TYPE 62 2
2 0_HETLOSS 294 2
3 Statistics p-value
4 -1.489 0.136487
0 1 2
0 size driver_mutation_count
1 1_WILD_TYPE 62 2
2 >=1_LOSS 444 2
3 Statistics p-value
4 -2.61208 0.00899937
0 1 2
0 size driver_mutation_count
1 >1muts 40 3
2 >=1_LOSS 444 2
3 Statistics p-value
4 1.52969 0.126094
In [15]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='snv_driver_mutation_count',
                  figsize=(8,12),
                  title='SNV Driver Mutation Count - TP53 Subroups - No WGD',
                  xlim=[-0.1,15])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='snv_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='snv_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='snv_driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size snv_driver_mutation_count
1 1_WILD_TYPE 62 1
2 0_HETLOSS 294 1
3 Statistics p-value
4 -0.769284 0.441725
0 1 2
0 size snv_driver_mutation_count
1 1_WILD_TYPE 62 1
2 >=1_LOSS 444 2
3 Statistics p-value
4 -1.81603 0.0693659
0 1 2
0 size snv_driver_mutation_count
1 >1muts 40 2
2 >=1_LOSS 444 2
3 Statistics p-value
4 1.64654 0.0996525
In [16]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='indel_driver_mutation_count',
                  figsize=(8,12),
                  title='INDEL Driver Mutation Count - TP53 Subroups - No WGD',
                  xlim=[-0.1,35])
plt.show()

display_side_by_side(get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '0_HETLOSS']),
       
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['1_WILD_TYPE', '>=1_LOSS']),
        
       get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='indel_driver_mutation_count', 
               group_list=['>1muts', '>=1_LOSS']))
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 62 0
2 0_HETLOSS 294 0
3 Statistics p-value
4 -1.73887 0.082058
0 1 2
0 size indel_driver_mutation_count
1 1_WILD_TYPE 62 0
2 >=1_LOSS 444 0
3 Statistics p-value
4 -2.12434 0.0336416
0 1 2
0 size indel_driver_mutation_count
1 >1muts 40 0
2 >=1_LOSS 444 0
3 Statistics p-value
4 -0.614945 0.538591

Here, one major information:

  • In NSCLC, there are very few numbers of Co-Drivers, INDEL Co-Drivers nor SNV Co-Drivers.
    • Maybe TP53 mutations in NSCLC are very pervasvie and do not need more drivers to expand

Genome Instability

The idea here is to see if we have differences in Fraction of Genome Altered if we cut our Cancer cohort on the number of drivers per sample.

Do we have more instability with more INDEL Driver Mutations within the same subgroup?

1_WILD_TYPE Subgroup

In [19]:
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']

thr=1

def get_driver_groups(x):
    if x.indel_driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.indel_driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    

master_no_wgd_cancer_wt['co_driver_group'] = master_no_wgd_cancer_wt.apply(get_driver_groups, axis=1)
In [20]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_wt,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='frac_genome_altered',
                  figsize=(4,10),
                  title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_no_wgd_cancer_wt,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
Out[20]:
0 1 2
0 size frac_genome_altered
1 High Co-Driver Count 3 0.068
2 Low Co-Driver Count 59 0.165
3 Statistics p-value
4 -0.213229 0.831148

0_HETLOSS

In [21]:
master_no_wgd_cancer_het = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '0_HETLOSS']

thr=1

def get_driver_groups(x):
    if x.indel_driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.indel_driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    

master_no_wgd_cancer_het['co_driver_group'] = master_no_wgd_cancer_het.apply(get_driver_groups, axis=1)
In [22]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_het,
                  group='co_driver_group',
                  palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
                  order=['High Co-Driver Count', 'Low Co-Driver Count'],
                  metrics='frac_genome_altered',
                  figsize=(4,10),
                  title='Fraction of Genome Altered - 0_HETLOSS subgroup - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_no_wgd_cancer_het,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
Out[22]:
0 1 2
0 size frac_genome_altered
1 High Co-Driver Count 38 0.315
2 Low Co-Driver Count 256 0.342
3 Statistics p-value
4 -0.522467 0.601345
In [ ]:
 

Co Driver Analysis

In [29]:
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
                    maf=maf_cohort_nowgd,
                    head=15)

codrivers_cancer_tp53 = get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] >= 1],
                    maf=maf_cohort_nowgd,
                    head=15)
In [30]:
co_drivers = pd.merge(codrivers_cancer, codrivers_cancer_tp53, on='Hugo_Symbol')
co_drivers.columns = ['cancer', 'cancer_tp53']
In [31]:
co_drivers['ratio'] = co_drivers.apply(lambda x: 100*round(x.cancer_tp53/x.cancer, 4) , axis=1)
co_drivers = co_drivers.sort_values(by='ratio', ascending=False)
In [32]:
co_drivers
Out[32]:
cancer cancer_tp53 ratio
Hugo_Symbol
RB1 59 45 76.27
CDKN2A 112 81 72.32
PIK3CA 116 53 45.69
BRAF 81 35 43.21
NF1 91 36 39.56
ARID1A 78 30 38.46
EGFR 452 146 32.30
SETD2 89 25 28.09
KEAP1 129 36 27.91
KRAS 594 148 24.92
RBM10 155 37 23.87
STK11 262 47 17.94
In [34]:
codrivers_cancer
Out[34]:
count
Hugo_Symbol
KRAS 594
EGFR 452
STK11 262
RBM10 155
KEAP1 129
PIK3CA 116
CDKN2A 112
NF1 91
SETD2 89
BRAF 81
CTNNB1 78
ARID1A 78
SMARCA4 72
MGA 62
RB1 59
In [74]:
labels = []
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
                                       maf=maf_cohort_nowgd,
                                       head=15,
                                       tp53=True)

codrivers_cancer['proportion'] = codrivers_cancer.apply(lambda x: 100* round(x['count'] / codrivers_cancer.sum().values[0], 4), axis=1)

for element in codrivers_cancer.head(15).index.tolist():
    labels.append(element + ' ('+ str(int(codrivers_cancer.loc[element]['count']))+')')

ax = sns.barplot(y=codrivers_cancer.head(15).index, x='proportion',data=codrivers_cancer.head(15)[['proportion']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Drivers Frequency in {}'.format(cancer))
Out[74]:
Text(0.5, 1.0, 'Drivers Frequency in Non-Small Cell Lung Cancer')
In [33]:
labels = []
for element in co_drivers.index.tolist():
    labels.append(element + ' ('+ str(int(co_drivers.loc[element]['cancer']))+')')

ax = sns.barplot(y=co_drivers.index, x='ratio',data=co_drivers[['ratio']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Co-Drivers Enrichment in TP53 State')
Out[33]:
Text(0.5, 1.0, 'Co-Drivers Enrichment in TP53 State')

Co-Drivers per subgroup

In [87]:
co_drivers_res = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_res_group',
                                                group_1='tp53_res',
                                                group_2='no_tp53_res')
co_drivers_res
Out[87]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
EGFR 128 -16.54 114 8.89
KRAS 115 -14.86 120 9.35
STK11 44 -5.68 41 3.20
CTNNB1 24 -3.10 9 0.70
RBM10 24 -3.10 32 2.49
NF1 21 -2.71 30 2.34
PIK3CA 20 -2.58 46 3.59
SMAD4 16 -2.07 18 1.40
ARID1A 16 -2.07 28 2.18
BRAF 15 -1.94 30 2.34
CDKN2A 15 -1.94 70 5.46
SETD2 15 -1.94 24 1.87
MET 14 -1.81 12 0.94
SMARCA4 13 -1.68 19 1.48
MGA 13 -1.68 17 1.33
ATM 13 -1.68 6 0.47
TERT 11 -1.42 22 1.71
KEAP1 11 -1.42 34 2.65
ERBB2 11 -1.42 20 1.56
RB1 10 -1.29 40 3.12
PTEN 9 -1.16 30 2.34
NF2 8 -1.03 6 0.47
APC 8 -1.03 16 1.25
EP300 8 -1.03 4 0.31
ATRX 8 -1.03 11 0.86
U2AF1 7 -0.90 6 0.47
MAX 7 -0.90 7 0.55
NFE2L2 7 -0.90 15 1.17
TGFBR2 7 -0.90 5 0.39
KMT2C 6 -0.78 12 0.94
KMT2D 6 -0.78 33 2.57
PBRM1 5 -0.65 15 1.17
KDM5C 4 -0.52 14 1.09
MTOR 4 -0.52 3 0.23
TSC2 4 -0.52 3 0.23
TET2 4 -0.52 7 0.55
PIK3R1 4 -0.52 9 0.70
ARID2 3 -0.39 15 1.17
NOTCH3 3 -0.39 9 0.70
GRIN2A 3 -0.39 6 0.47
INPP4B 3 -0.39 5 0.39
PTPRT 3 -0.39 9 0.70
TSC1 3 -0.39 6 0.47
KMT2A 3 -0.39 4 0.31
DICER1 3 -0.39 6 0.47
MED12 3 -0.39 12 0.94
PTPRD 3 -0.39 8 0.62
EPHA3 3 -0.39 7 0.55
AMER1 2 -0.26 3 0.23
KDM6A 2 -0.26 3 0.23
ALK 2 -0.26 6 0.47
FBXW7 2 -0.26 14 1.09
POLE 2 -0.26 5 0.39
ZFHX3 2 -0.26 8 0.62
BRCA2 2 -0.26 7 0.55
CREBBP 2 -0.26 7 0.55
BCOR 2 -0.26 8 0.62
SMAD2 2 -0.26 3 0.23
FUBP1 2 -0.26 4 0.31
PTPN11 1 -0.13 3 0.23
RASA1 1 -0.13 15 1.17
STAG2 1 -0.13 7 0.55
SPEN 1 -0.13 4 0.31
MAP2K1 1 -0.13 3 0.23
CIC 1 -0.13 5 0.39
In [136]:
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_res[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#2ECC71','#1E8449'])
ax.legend(['TP53 Residual', 'No TP53 Residual'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [20, 15, 10, 5, 0, 5, 10, 15]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()
[-20.0, -15.0, -10.0, -5.0, 0.0, 5.0, 10.0, 15.0]
In [36]:
co_drivers_cnloh_loss = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='>=1_cnLOH',
                                                group_2='>=1_LOSS')
co_drivers_cnloh_loss
Out[36]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
EGFR 21 -9.01 99 9.45
KRAS 17 -7.30 104 9.92
CDKN2A 12 -5.15 57 5.44
PTEN 10 -4.29 19 1.81
PIK3CA 9 -3.86 38 3.63
RB1 8 -3.43 32 3.05
RASA1 8 -3.43 6 0.57
PBRM1 8 -3.43 8 0.76
STK11 6 -2.58 35 3.34
NF1 5 -2.15 23 2.19
KMT2D 5 -2.15 23 2.19
TERT 4 -1.72 16 1.53
KEAP1 4 -1.72 28 2.67
CREBBP 4 -1.72 3 0.29
PTPRD 3 -1.29 3 0.29
FBXW7 3 -1.29 10 0.95
NOTCH3 3 -1.29 6 0.57
MED12 3 -1.29 8 0.76
BRAF 3 -1.29 27 2.58
NOTCH1 3 -1.29 6 0.57
ARID1A 3 -1.29 21 2.00
RBM10 3 -1.29 31 2.96
GRIN2A 2 -0.86 4 0.38
PIK3R1 2 -0.86 6 0.57
NF2 2 -0.86 4 0.38
PTPRT 2 -0.86 6 0.57
NOTCH2 2 -0.86 2 0.19
APC 2 -0.86 9 0.86
SMAD4 2 -0.86 15 1.43
SMARCA4 2 -0.86 17 1.62
ATRX 2 -0.86 7 0.67
FAT1 2 -0.86 9 0.86
ERBB2 2 -0.86 19 1.81
TP63 1 -0.43 3 0.29
NSD1 1 -0.43 3 0.29
TSC1 1 -0.43 5 0.48
TGFBR2 1 -0.43 4 0.38
POLE 1 -0.43 4 0.38
TET2 1 -0.43 6 0.57
TET1 1 -0.43 5 0.48
SPEN 1 -0.43 3 0.29
NRAS 1 -0.43 5 0.48
SETD2 1 -0.43 22 2.10
MET 1 -0.43 11 1.05
NOTCH4 1 -0.43 5 0.48
EPHA3 1 -0.43 5 0.48
FGFR3 1 -0.43 5 0.48
ELF3 1 -0.43 2 0.19
NFE2L2 1 -0.43 13 1.24
CTNNB1 1 -0.43 7 0.67
CIC 1 -0.43 3 0.29
BRCA2 1 -0.43 4 0.38
B2M 1 -0.43 7 0.67
ARID1B 1 -0.43 4 0.38
INPP4B 1 -0.43 4 0.38
INPPL1 1 -0.43 7 0.67
KDM5C 1 -0.43 13 1.24
KMT2C 1 -0.43 8 0.76
MGA 1 -0.43 17 1.62
MSH3 1 -0.43 3 0.29
ZFHX3 1 -0.43 7 0.67
In [37]:
labels = []
for element in co_drivers_cnloh_loss.head(10).index.tolist():
    labels.append(element + ' ('+ str(int(co_drivers_cnloh_loss.loc[element]['count_x']))+')')

ax = sns.barplot(y=co_drivers_cnloh_loss.head(10).index, x='proportion_1',data=co_drivers_cnloh_loss.head(10)[['proportion_1']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Co-Drivers Frequency in {} - {}'.format(subgroup, cancer))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-37-ef47ed169b5d> in <module>
      5 ax = sns.barplot(y=co_drivers_cnloh_loss.head(10).index, x='proportion_1',data=co_drivers_cnloh_loss.head(10)[['proportion_1']], color='#7F8C8D', saturation=.2)
      6 ax.set_yticklabels(labels)
----> 7 ax.set_title('Co-Drivers Frequency in {} - {}'.format(subgroup, cancer))

NameError: name 'subgroup' is not defined
In [41]:
fig=plt.figure(figsize=(8,8))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_cnloh_loss[['proportion_1', 'proportion_2']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[4],mc[0]])
ax.legend(['>=1_cnLOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [-10, -7.5, -5, -2.5, 0, 2.5, 5, 7.5, 10, 12.5]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()
  • The big picture is the same for both cancers
  • Enrichment in PTEN and RASA1 / PBRM1 for >=1_cnLOH

In [47]:
co_drivers_losses = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='0_HETLOSS',
                                                group_2='>=1_LOSS')
co_drivers_losses
Out[47]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
EGFR 108 -17.01 99 9.45
KRAS 94 -14.80 104 9.92
STK11 39 -6.14 35 3.34
RBM10 23 -3.62 31 2.96
CTNNB1 23 -3.62 7 0.67
NF1 17 -2.68 23 2.19
PIK3CA 16 -2.52 38 3.63
MGA 13 -2.05 17 1.62
SMAD4 13 -2.05 15 1.43
SETD2 13 -2.05 22 2.10
BRAF 12 -1.89 27 2.58
ARID1A 12 -1.89 21 2.00
MET 11 -1.73 11 1.05
KEAP1 11 -1.73 28 2.67
ATM 10 -1.57 7 0.67
SMARCA4 10 -1.57 17 1.62
ERBB2 10 -1.57 19 1.81
TERT 9 -1.42 16 1.53
NF2 8 -1.26 4 0.38
CDKN2A 7 -1.10 57 5.44
EP300 7 -1.10 4 0.38
ATRX 7 -1.10 7 0.67
TGFBR2 6 -0.94 4 0.38
RB1 6 -0.94 32 3.05
APC 6 -0.94 9 0.86
PTEN 6 -0.94 19 1.81
MAX 5 -0.79 8 0.76
KMT2C 5 -0.79 8 0.76
KMT2D 4 -0.63 23 2.19
NFE2L2 4 -0.63 13 1.24
MTOR 3 -0.47 3 0.29
TSC1 3 -0.47 5 0.48
ARID2 3 -0.47 15 1.43
PIK3R1 3 -0.47 6 0.57
U2AF1 3 -0.47 7 0.67
PTPRD 3 -0.47 3 0.29
TET2 3 -0.47 6 0.57
INPP4B 3 -0.47 4 0.38
KDM5C 2 -0.31 13 1.24
AMER1 2 -0.31 3 0.29
PBRM1 2 -0.31 8 0.76
POLE 2 -0.31 4 0.38
KMT2A 2 -0.31 4 0.38
PPP2R1A 2 -0.31 3 0.29
ALK 2 -0.31 6 0.57
PTPRT 2 -0.31 6 0.57
DICER1 2 -0.31 5 0.48
BCOR 2 -0.31 8 0.76
EPHA3 2 -0.31 5 0.48
BRCA2 2 -0.31 4 0.38
FBXW7 2 -0.31 10 0.95
ZFHX3 2 -0.31 7 0.67
GRIN2A 2 -0.31 4 0.38
STAG2 1 -0.16 6 0.57
SPEN 1 -0.16 3 0.29
PTPN11 1 -0.16 3 0.29
CDK12 1 -0.16 2 0.19
ELF3 1 -0.16 2 0.19
CYLD 1 -0.16 3 0.29
CREBBP 1 -0.16 3 0.29
CIC 1 -0.16 3 0.29
BRIP1 1 -0.16 5 0.48
BAP1 1 -0.16 3 0.29
B2M 1 -0.16 7 0.67
In [51]:
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_losses[['proportion_1', 'proportion_2']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[5],mc[0]])
ax.legend(['0_HETLOSS', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [-20, -15, -10, -5, 0, 5, 10, 15]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()
  • Enrichment in EGFR, KRAS, STK11 for 0_HETLOSS

In [48]:
co_drivers_mult_cnloh = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='>1muts',
                                                group_2='>=1_cnLOH')
co_drivers_mult_cnloh
Out[48]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
EGFR 6 -5.17 21 9.01
KRAS 6 -5.17 17 7.30
CDKN2A 6 -5.17 12 5.15
KMT2D 5 -4.31 5 2.15
APC 5 -4.31 2 0.86
NF1 5 -4.31 5 2.15
ARID1A 3 -2.59 3 1.29
KMT2C 3 -2.59 1 0.43
RB1 3 -2.59 8 3.43
BRAF 3 -2.59 3 1.29
MET 3 -2.59 1 0.43
FAT1 3 -2.59 2 0.86
MGA 2 -1.72 1 0.43
NFE2L2 2 -1.72 1 0.43
MUTYH 2 -1.72 1 0.43
NSD1 2 -1.72 1 0.43
PIK3CA 2 -1.72 9 3.86
PTEN 2 -1.72 10 4.29
RASA1 2 -1.72 8 3.43
STK11 2 -1.72 6 2.58
KEAP1 2 -1.72 4 1.72
MED12 2 -1.72 3 1.29
INPPL1 1 -0.86 1 0.43
ATRX 1 -0.86 2 0.86
PTPRT 1 -0.86 2 0.86
BARD1 1 -0.86 1 0.43
RBM10 1 -0.86 3 1.29
SMAD4 1 -0.86 2 0.86
SMARCA4 1 -0.86 2 0.86
TERT 1 -0.86 4 1.72
TGFBR2 1 -0.86 1 0.43
TSC2 1 -0.86 2 0.86
PTPRD 1 -0.86 3 1.29
GRIN2A 1 -0.86 2 0.86
FUBP1 1 -0.86 2 0.86
BRCA2 1 -0.86 1 0.43
CREBBP 1 -0.86 4 1.72
CIC 1 -0.86 1 0.43
PBRM1 1 -0.86 8 3.43
In [52]:
get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>1muts'],
                    maf=maf_cohort_nowgd,
                    head=100)
Out[52]:
count
Hugo_Symbol
EGFR 6
KRAS 6
CDKN2A 6
KMT2D 5
APC 5
NF1 5
ARID1A 3
KMT2C 3
RB1 3
BRAF 3
MET 3
FAT1 3
ANKRD11 2
MGA 2
NFE2L2 2
MUTYH 2
JAK1 2
NSD1 2
PIK3CA 2
PTEN 2
RASA1 2
SMAD2 2
STK11 2
KEAP1 2
MED12 2
INPPL1 1
ATRX 1
PTPRT 1
BCL2L11 1
BARD1 1
RBM10 1
RUNX1 1
SDHA 1
SDHB 1
SMAD4 1
BLM 1
SMARCA4 1
STAG2 1
ASXL2 1
SUFU 1
TERT 1
TGFBR2 1
TSC2 1
PTPRD 1
POLD1 1
INHA 1
EP300 1
IFNGR1 1
GRIN2A 1
FUBP1 1
MAP3K1 1
MAX 1
FOXO1 1
FOXL2 1
MTOR 1
BRCA2 1
DICER1 1
MYD88 1
CREBBP 1
CIC 1
CDK8 1
PARK2 1
PBRM1 1
U2AF1 1
In [53]:
fig=plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_mult_cnloh[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[3],mc[4]])
ax.legend(['>1muts', '>=1_cnLOH'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
print(a)
#a = [-20, -10, 0, 10, 20, 30, 40]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()
[-6.0, -4.0, -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0]

* Hard to interpret as we have fewersamples in >1muts subgroup

In [57]:
co_drivers_wt_loss = create_co_drivers_table(master=master_no_wgd_cancer, 
                                                group_type='tp53_group',
                                                group_1='1_WILD_TYPE',
                                                group_2='0_HETLOSS')
co_drivers_wt_loss
Out[57]:
count_x proportion_1 count_y proportion_2
Hugo_Symbol
KRAS 17 -15.18 94 14.80
EGFR 15 -13.39 108 17.01
CDKN2A 5 -4.46 7 1.10
STK11 4 -3.57 39 6.14
NF1 3 -2.68 17 2.68
PIK3CA 3 -2.68 16 2.52
ARID1A 3 -2.68 12 1.89
SMAD4 3 -2.68 13 2.05
U2AF1 3 -2.68 3 0.47
ATM 3 -2.68 10 1.57
KDM5C 2 -1.79 2 0.31
RB1 2 -1.79 6 0.94
PBRM1 2 -1.79 2 0.31
TERT 2 -1.79 9 1.42
KMT2D 2 -1.79 4 0.63
PTEN 2 -1.79 6 0.94
SMARCA4 2 -1.79 10 1.57
NFE2L2 2 -1.79 4 0.63
SETD2 2 -1.79 13 2.05
BRAF 2 -1.79 12 1.89
RBM10 1 -0.89 23 3.62
PTPRT 1 -0.89 2 0.31
RUNX1 1 -0.89 1 0.16
PIK3R1 1 -0.89 3 0.47
TET2 1 -0.89 3 0.47
SF3B1 1 -0.89 1 0.16
APC 1 -0.89 6 0.94
MET 1 -0.89 11 1.73
ATRX 1 -0.89 7 1.10
CDK12 1 -0.89 1 0.16
CTNNB1 1 -0.89 23 3.62
DICER1 1 -0.89 2 0.31
EPHA3 1 -0.89 2 0.31
ERBB2 1 -0.89 10 1.57
KMT2A 1 -0.89 2 0.31
KMT2C 1 -0.89 5 0.79
MAX 1 -0.89 5 0.79
In [60]:
fig=plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_wt_loss[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[2],mc[5]])
ax.legend(['1_WILD_TYPE', '0_HETLOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [-20, -15, -10, -5, 0, 5, 10, 15, 20]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()

Comparison with WGD Cohort (WGD - TP53 - LOH)

In [61]:
def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10):
    samples = master.Tumor_Id.tolist()
    maf_filtered = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['driver'] == True][maf['Hugo_Symbol'] != 'TP53']
    h = pd.DataFrame(maf_filtered[['Hugo_Symbol']].groupby(['Hugo_Symbol']).size())
    h.columns = ['count']
    h = h.sort_values(by='count', ascending=False).head(head)
    
    return(h)

def create_co_drivers_table_wgd(master_1: pd.DataFrame, master_2: pd.DataFrame, group_type:str,  group_1: str):
    master_group_1 = master_1[master_1[group_type] == group_1]
    co_drivers_group_1 = get_major_codrivers(master=master_group_1,
                        maf=maf_cohort_nowgd,
                       head=100)
    co_drivers_group_1['proportion_1'] = co_drivers_group_1.apply(lambda x: 100* round(x['count'] / co_drivers_group_1.sum().values[0], 4), axis=1)
    
    master_group_2 = master_2[master_2['tp53_count'] >=1][master_2['tp53_loh_status'] == True]
    co_drivers_group_2 = get_major_codrivers(master=master_group_2,
                        maf=maf_cohort_wgd,
                       head=100)
    co_drivers_group_2['proportion_2'] = co_drivers_group_2.apply(lambda x: 100* round(x['count'] / co_drivers_group_2.sum().values[0], 4), axis=1)
    
    co_drivers_groups = pd.merge(co_drivers_group_2, co_drivers_group_1, on='Hugo_Symbol')
    co_drivers_groups['proportion_2'] = - co_drivers_groups['proportion_2']
    
    return co_drivers_groups
In [62]:
co_drivers_wgd_loss = create_co_drivers_table_wgd(master_1=master_no_wgd_cancer,
                            master_2=master_wgd_cancer,
                            group_type='tp53_group',
                            group_1='>=1_LOSS')
co_drivers_wgd_loss
Out[62]:
count_x proportion_2 count_y proportion_1
Hugo_Symbol
EGFR 302 -16.29 99 9.45
KRAS 112 -6.04 104 9.92
CDKN2A 78 -4.21 57 5.44
RB1 70 -3.78 32 3.05
NF1 69 -3.72 23 2.19
STK11 62 -3.34 35 3.34
SMARCA4 54 -2.91 17 1.62
PIK3CA 53 -2.86 38 3.63
KEAP1 53 -2.86 28 2.67
ARID1A 47 -2.54 21 2.00
KMT2D 40 -2.16 23 2.19
RBM10 39 -2.10 31 2.96
PTEN 31 -1.67 19 1.81
ARID2 30 -1.62 15 1.43
NFE2L2 30 -1.62 13 1.24
APC 30 -1.62 9 0.86
ERBB2 23 -1.24 19 1.81
MGA 22 -1.19 17 1.62
KMT2C 22 -1.19 8 0.76
PTPRT 22 -1.19 6 0.57
PBRM1 20 -1.08 8 0.76
TET2 20 -1.08 6 0.57
BRAF 20 -1.08 27 2.58
RASA1 19 -1.02 6 0.57
CTNNB1 19 -1.02 7 0.67
TERT 18 -0.97 16 1.53
FAT1 18 -0.97 9 0.86
SETD2 18 -0.97 22 2.10
PTPRD 18 -0.97 3 0.29
MET 18 -0.97 11 1.05
SPEN 16 -0.86 3 0.29
SMAD4 14 -0.76 15 1.43
EPHA3 14 -0.76 5 0.48
NOTCH1 13 -0.70 6 0.57
MED12 13 -0.70 8 0.76
ZFHX3 11 -0.59 7 0.67
EP300 11 -0.59 4 0.38
PIK3R1 10 -0.54 6 0.57
PARK2 10 -0.54 2 0.19
ATRX 10 -0.54 7 0.67
FBXW7 10 -0.54 10 0.95
POLE 9 -0.49 4 0.38
ATM 9 -0.49 7 0.67
NOTCH3 9 -0.49 6 0.57
MAP2K1 9 -0.49 3 0.29
B2M 8 -0.43 7 0.67
EPHB1 8 -0.43 2 0.19
CBL 8 -0.43 3 0.29
DICER1 8 -0.43 5 0.48
CREBBP 8 -0.43 3 0.29
ARID1B 8 -0.43 4 0.38
INPP4B 7 -0.38 4 0.38
GRIN2A 7 -0.38 4 0.38
TET1 7 -0.38 5 0.48
ASXL1 7 -0.38 4 0.38
U2AF1 7 -0.38 7 0.67
NF2 7 -0.38 4 0.38
TSC1 6 -0.32 5 0.48
NRAS 6 -0.32 5 0.48
BRCA1 6 -0.32 4 0.38
BRCA2 6 -0.32 4 0.38
INHBA 5 -0.27 2 0.19
KDM5C 5 -0.27 13 1.24
TP63 5 -0.27 3 0.29
NOTCH2 5 -0.27 2 0.19
MAX 5 -0.27 8 0.76
HLA-A 5 -0.27 5 0.48
PALB2 4 -0.22 3 0.29
NSD1 4 -0.22 3 0.29
NOTCH4 4 -0.22 5 0.48
STAG2 4 -0.22 6 0.57
DNMT3A 4 -0.22 6 0.57
CDK12 4 -0.22 2 0.19
In [69]:
fig=plt.figure(figsize=(8,8))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

co_drivers_wgd_loss[['proportion_2', 'proportion_1']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#7F8C8D',mc[0]])
ax.legend(['WGD - TP53 - LOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)

plt.yticks(fontsize=10)
ax.set_ylabel('')
a=ax.get_xticks().tolist()
a = [-20, -15, -10, -5, 0, 5, 10, 15]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)

plt.show()

Genome Instability Associated

In [101]:
def get_master_codrivers(master: pd.DataFrame, maf: pd.DataFrame, symbol: str):
    samples = master.Tumor_Id.tolist()
    samples_final = maf[maf.Tumor_Sample_Barcode.isin(samples)][maf['Hugo_Symbol'] == symbol].Tumor_Sample_Barcode.tolist()

    master_filtered = master[master.Tumor_Id.isin(samples_final)]
    
    return master_filtered

>=1_cnLOH

In [73]:
master_no_wgd_cancer_cnloh = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_cnLOH']
master_APC = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                   maf=maf_cohort_nowgd,
                                   symbol='APC')

master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')

master_no_wgd_cancer_cnloh['data'] = '>=1_cnLOH'
master_APC['data'] = 'APC'
master_KRAS['data'] = 'KRAS'
master_PIK3CA['data'] = 'PIK3CA'

masters = [master_no_wgd_cancer_cnloh, master_APC, master_KRAS, master_PIK3CA]
allMasters = pd.concat(masters)
In [74]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_cnLOH')
ax.set_xlabel('')
Out[74]:
Text(0.5, 0, '')

>=1_LOSS

In [78]:
master_no_wgd_cancer_loss = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_LOSS']
master_APC = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='APC')

master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')

master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='PIK3CA')

master_no_wgd_cancer_loss['data'] = '>=1_loss'
master_APC['data'] = 'APC'
master_KRAS['data'] = 'KRAS'
master_PIK3CA['data'] = 'PIK3CA'

masters = [master_no_wgd_cancer_loss, master_APC, master_KRAS, master_PIK3CA]
allMasters = pd.concat(masters)
In [79]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_LOSS')
ax.set_xlabel('')
Out[79]:
Text(0.5, 0, '')

>1muts

In [77]:
master_no_wgd_cancer_muts = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>1muts']
master_APC = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                   maf=maf_cohort_nowgd,
                                   symbol='APC')

master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')

master_KMT2D = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                   maf=maf_cohort_nowgd,
                                   symbol='KMT2D')

master_RNF43 = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                   maf=maf_cohort_nowgd,
                                   symbol='RNF43')

master_no_wgd_cancer_muts['data'] = '>1muts'
master_APC['data'] = 'APC'
master_KRAS['data'] = 'KRAS'
master_KMT2D['data'] = 'KMT2D'
master_RNF43['data'] = 'RNF43'

masters = [master_no_wgd_cancer_muts, master_APC, master_KRAS, master_KMT2D, master_RNF43]
allMasters = pd.concat(masters)
In [76]:
#### >1mutsfig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >1muts')
ax.set_xlabel('')
Out[76]:
Text(0.5, 0, '')

1_WILD_TYPE

In [105]:
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']

master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_wt,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')

master_EGFR = get_master_codrivers(master=master_no_wgd_cancer_wt,
                                   maf=maf_cohort_nowgd,
                                   symbol='EGFR')

master_CDKN2A = get_master_codrivers(master=master_no_wgd_cancer_wt,
                                   maf=maf_cohort_nowgd,
                                   symbol='CDKN2A')


master_no_wgd_cancer_wt['data'] = '1_WT'
master_KRAS['data'] = 'KRAS'
master_EGFR['data'] = 'EGFR'
master_CDKN2A['data'] = 'CDKN2A'

masters = [master_no_wgd_cancer_wt, master_EGFR, master_KRAS, master_CDKN2A]
allMasters = pd.concat(masters)
In [108]:
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - 1_WT')
ax.set_xlabel('')
Out[108]:
Text(0.5, 0, '')
In [ ]:
 

CCF / VAF Analysis

Same here we take only samples with exactly 1 tp53 mutation (master_hotspot).

We have to define groups for CCF to see if there are differences between those groups. To have an idea of the CCF distribution we show here the distribution coming from the cancer_panel.

We see that our tp53_ccf distribution is very high for all subgroups except for 1_WILD_TYPEAdd and >1muts. >=1_LOSS is the biggest subgroup - by far - and has a very high CCF median.

It will be hard to cut the cohort based on the CCF. Let's try and see the size of the subcohorts:

In [20]:
master_ccf = master_no_wgd_cancer[(master_no_wgd_cancer['tp53_count'] == 1) | (master_no_wgd_cancer['tp53_group'] == '0_HETLOSS')]
In [22]:
thr_ccf_1 = 0.8 ; thr_ccf_2 = 0.9

def ccf_subgroup(x):
    if x.tp53_ccf_1 <= thr_ccf_1: return 'low'
    elif x.tp53_ccf_1 <= thr_ccf_2: return 'medium'
    elif x.tp53_ccf_1 > thr_ccf_2: return 'high'

master_ccf['ccf_group'] = master_ccf.apply(ccf_subgroup, axis=1)
get_groupby(master_ccf, 'ccf_group', 'count')
Out[22]:
count
ccf_group
high 382
low 131
medium 78
In [83]:
thr_vaf_1 = 0.3 ; thr_vaf_2 = 0.4

def vaf_subgroup(x):
    if x.tp53_vaf_1 <= thr_vaf_1: return 'low'
    elif x.tp53_vaf_1 <= thr_vaf_2: return 'medium'
    elif x.tp53_vaf_1 > thr_vaf_2: return 'high'

master_ccf['vaf_group'] = master_ccf.apply(vaf_subgroup, axis=1)       
get_groupby(master_ccf, 'vaf_group', 'count')
Out[83]:
count
vaf_group
high 101
low 432
medium 65

VAF Analysis

No VAF Cut

In [61]:
fig, ax = boxplot_sampletype(df=master_ccf,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_ccf,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['0_HETLOSS', '1_WILD_TYPE'])
Out[61]:
0 1 2
0 size frac_genome_altered
1 0_HETLOSS 294 0.3375
2 1_WILD_TYPE 62 0.1505
3 Statistics p-value
4 5.34833 8.87716e-08

Low VAF

In [62]:
master_low = master_ccf[(master_ccf['vaf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_low,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - VAF < {} - {}'.format(thr_vaf_1,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_low,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '>=1_LOSS'])
Out[62]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 62 0.1505
2 >=1_LOSS 306 0.368
3 Statistics p-value
4 -6.66275 2.68756e-11

Medium VAF

In [63]:
master_med = master_ccf[(master_ccf['vaf_group'] == 'medium') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_med,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {} < VAF < {} - {}'.format(thr_vaf_1,thr_vaf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_med,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-63-b5c51832bd39> in <module>
      8                   figsize=(5,10),
      9                   title='Fraction of Genome Altered - {} < VAF < {} - {}'.format(thr_vaf_1,thr_vaf_2,cancer),
---> 10                   xlim=[0,1])
     11 plt.show()
     12 

<ipython-input-43-61329b9ecf9a> in boxplot_sampletype(df, group, palette, order, metrics, figsize, title, title_font, xlim)
     46     labels = []
     47     for element in order:
---> 48         labels.append(element + '\n('+ str(groupby_.loc[element].values[0])+')')
     49 
     50 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1765 
   1766             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767             return self._getitem_axis(maybe_callable, axis=axis)
   1768 
   1769     def _is_scalar_access(self, key: Tuple):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1962         # fall thru to straight lookup
   1963         self._validate_key(key, axis)
-> 1964         return self._get_label(key, axis=axis)
   1965 
   1966 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
    622             raise IndexingError("no slices here, handle elsewhere")
    623 
--> 624         return self.obj._xs(label, axis=axis)
    625 
    626     def _get_loc(self, key: int, axis: int):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3535             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3536         else:
-> 3537             loc = self.index.get_loc(key)
   3538 
   3539             if isinstance(loc, np.ndarray):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

High VAF

In [64]:
master_high = master_ccf[(master_ccf['vaf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_high,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - VAF > {} - {}'.format(thr_vaf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_high,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-64-d72ad9dc6c47> in <module>
      8                   figsize=(5,10),
      9                   title='Fraction of Genome Altered - VAF > {} - {}'.format(thr_vaf_2,cancer),
---> 10                   xlim=[0,1])
     11 plt.show()
     12 

<ipython-input-43-61329b9ecf9a> in boxplot_sampletype(df, group, palette, order, metrics, figsize, title, title_font, xlim)
     46     labels = []
     47     for element in order:
---> 48         labels.append(element + '\n('+ str(groupby_.loc[element].values[0])+')')
     49 
     50 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1765 
   1766             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767             return self._getitem_axis(maybe_callable, axis=axis)
   1768 
   1769     def _is_scalar_access(self, key: Tuple):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1962         # fall thru to straight lookup
   1963         self._validate_key(key, axis)
-> 1964         return self._get_label(key, axis=axis)
   1965 
   1966 

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
    622             raise IndexingError("no slices here, handle elsewhere")
    623 
--> 624         return self.obj._xs(label, axis=axis)
    625 
    626     def _get_loc(self, key: int, axis: int):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3535             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3536         else:
-> 3537             loc = self.index.get_loc(key)
   3538 
   3539             if isinstance(loc, np.ndarray):

/opt/anaconda3/envs/mskimpact_env/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: '1_WILD_TYPE'

CCF Analysis

No CCF Cut

In [65]:
fig, ax = boxplot_sampletype(df=master_ccf,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_ccf,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['0_HETLOSS', '1_WILD_TYPE'])
Out[65]:
0 1 2
0 size frac_genome_altered
1 0_HETLOSS 294 0.3375
2 1_WILD_TYPE 62 0.1505
3 Statistics p-value
4 5.34833 8.87716e-08

Low CCF

In [66]:
master_low = master_ccf[(master_ccf['ccf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_low,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - CCF < {} - {}'.format(thr_ccf_1,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_low,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
Out[66]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 37 0.076
2 0_HETLOSS 294 0.3375
3 Statistics p-value
4 -7.31139 2.64401e-13

Medium CCF

In [67]:
master_med = master_ccf[(master_ccf['ccf_group'] == 'medium') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_med,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - {} < CCF < {} - {}'.format(thr_ccf_1,thr_ccf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_med,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
Out[67]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 2 0.2775
2 0_HETLOSS 294 0.3375
3 Statistics p-value
4 -0.663153 0.507233

High CCF

In [84]:
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]

fig, ax = boxplot_sampletype(df=master_high,
                  group='tp53_group',
                  palette=palette,
                  order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
                  metrics='frac_genome_altered',
                  figsize=(5,10),
                  title='Fraction of Genome Altered - CCF > {} - {}'.format(thr_ccf_2,cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_high,
               group='tp53_group',
               metrics='frac_genome_altered',
               group_list=['1_WILD_TYPE', '0_HETLOSS'])
Out[84]:
0 1 2
0 size frac_genome_altered
1 1_WILD_TYPE 19 0.408
2 0_HETLOSS 294 0.3375
3 Statistics p-value
4 1.12079 0.262375

let's check the hotspot distribution

In [86]:
get_hotspot_frac(df=master_high[master_high['tp53_group'] == '1_WILD_TYPE'],
                group_type=None,
                group=None)
Out[86]:
0 1 2
0 spot # frac
1 158 2 0.2705
2 248 2 0.394
3 273 2 0.249
4 nan 2 0.312
5 162 1 0.531
6 163 1 0.414
7 195 1 0.416
8 285 1 0.471
9 317 1 0.603
10 321 1 0.503

Splitting on the different level of CCF / VAF

In [ ]:
fig, ax = boxplot_sampletype(df=master_hotspot,
                  group='vaf_group',
                  palette={'low': tab10[0] , 'medium': tab10[1], 'high':tab10[2]},
                  order=['low', 'medium', 'high'],
                  metrics='frac_genome_altered',
                  figsize=(3,10),
                  title='Fraction of Genome Altered - VAF levels - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_hotspot,
               group='vaf_group',
               metrics='frac_genome_altered',
               group_list=['low', 'medium'])
In [ ]:
fig, ax = boxplot_sampletype(df=master_hotspot,
                  group='ccf_group',
                  palette={'low': tab10[0] , 'medium': tab10[1], 'high':tab10[2]},
                  order=['low', 'medium', 'high'],
                  metrics='frac_genome_altered',
                  figsize=(3,10),
                  title='Fraction of Genome Altered - CCF levels - {}'.format(cancer),
                  xlim=[0,1])
plt.show()

get_statistics(df=master_hotspot,
               group='ccf_group',
               metrics='frac_genome_altered',
               group_list=['low', 'medium'])

Clinical Correlates

Age

In [15]:
#### TP53 Residual Groups#fig=plt.figure(figsize=(10,3))
ax = plt.subplot2grid(shape=(4,1), loc=(0,0), colspan=1)

sns.boxplot(x='Patient_Current_Age',data=master_no_wgd_cancer, ax=ax).set_title('Patient Age - {}'.format(cancer), weight='bold', fontsize=14)


style(ax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

TP53 Residual Groups

In [16]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_res_group',
                  palette=palette_res,
                  order=res_group_list,
                  metrics='Patient_Current_Age',
                  figsize=(3,10),
                  title='Patient Current Age - {}'.format(cancer),
                  xlim=[20,100])
plt.show()

get_statistics(df=master_no_wgd_cancer,
               group='tp53_res_group',
               metrics='Patient_Current_Age',
               group_list=['tp53_res', 'no_tp53_res'])#### TP53 Subgroups
Out[16]:
0 1 2
0 size Patient_Current_Age
1 tp53_res 374 69
2 no_tp53_res 544 69
3 Statistics p-value
4 -0.25777 0.796585

TP53 Subgroups

In [17]:
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                  group='tp53_group',
                  palette=palette,
                  order=group_list,
                  metrics='Patient_Current_Age',
                  figsize=(7,10),
                  title='Patient Current Age - {}'.format(cancer),
                  xlim=[20,100])
plt.show()

get_statistics(df=master_no_wgd_cancer,
               group='tp53_group',
               metrics='Patient_Current_Age',
               group_list=['1_WILD_TYPE', '>=1_cnLOH'])
Out[17]:
0 1 2
0 size Patient_Current_Age
1 1_WILD_TYPE 62 67
2 >=1_cnLOH 91 68
3 Statistics p-value
4 -0.533293 0.593831

Sex

In [18]:
h = get_groupby(master_no_wgd_cancer,'Sex', 'count').sort_values(by='count', ascending=False)
display(h)

h = h.T
h = h[['Male', 'Female']]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax)
ax.legend(['Male', 'Female'],loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=11)
ax.set_title('Sex Distribution - {} - No WGD'.format(cancer), weight='bold', fontsize=18)

plt.show()
count
Sex
Female 1042
Male 733

Survival Analysis

In [25]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
In [99]:
data = master_no_wgd_cancer.dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)

data = data[['tp53_group', 'tp53_res_group', 'Overall Survival Status 0/1', 'Overall_Survival_Months']]

ix1 = data['tp53_res_group'] == 'tp53_res'
ix2 = data['tp53_res_group'] == 'no_tp53_res'

T_exp, E_exp = data.loc[ix1, 'Overall_Survival_Months'], data.loc[ix1, 'Overall Survival Status 0/1']
T_con, E_con = data.loc[ix2, 'Overall_Survival_Months'], data.loc[ix2, 'Overall Survival Status 0/1']

results = logrank_test(T_exp, T_con, event_observed_A=E_exp, event_observed_B=E_con)
results.print_summary()
t_0 -1
null_distribution chi squared
degrees_of_freedom 1
test_name logrank_test
test_statistic p
0 10.90 <0.005
In [23]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
    kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
plt.show()
In [24]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
    kmf.plot_survival_function(color = palette_list[i], ax=ax)
plt.show()

HIGH CCF

In [26]:
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]
In [27]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF (CCF>0.9)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_high[master_high['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
    except: pass
plt.show()


fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF (CCF>0.9)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_high[master_high['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = palette_list[i], ax=ax)
    except: pass
plt.show()

LOW CCF

In [28]:
master_low = master_ccf[(master_ccf['ccf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]
In [29]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - Low CCF (CCF<0.8)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_low[master_low['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
    except: pass
plt.show()


fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - Low CCF (CCF<0.8)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_low[master_low['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = palette_list[i], ax=ax)
    except: pass
plt.show()

Co-Drivers

EGFR
In [116]:
master_EGFR = get_master_codrivers(master=master_no_wgd_cancer,
                                   maf=maf_cohort_nowgd,
                                   symbol='EGFR')
In [129]:
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - EGFR'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_EGFR[master_EGFR['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
    except: pass
plt.show()




fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - EGFR'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_EGFR[master_EGFR['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = palette_list[i], ax=ax)
    except: pass
plt.show()
KRAS
In [130]:
master_KRAS = get_master_codrivers(master=master_no_wgd_cancer,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')

fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - KRAS'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_KRAS[master_KRAS['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
    except: pass
plt.show()



fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - KRAS'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_KRAS[master_KRAS['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = palette_list[i], ax=ax)
    except: pass
plt.show()
CDKN2A
In [131]:
master_CDKN2A = get_master_codrivers(master=master_no_wgd_cancer,
                                   maf=maf_cohort_nowgd,
                                   symbol='CDKN2A')

fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - CDKN2A'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(res_group_list, range(len(res_group_list))):
    data = master_CDKN2A[master_CDKN2A['tp53_res_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = res_palette_list[i], ax=ax)
    except: pass
plt.show()



fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - CDKN2A'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()

for group,i in zip(group_list, range(len(group_list))):
    data = master_CDKN2A[master_CDKN2A['tp53_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    try:
        data['Overall Survival Status 0/1'] = data.apply(lambda x: 1 if x['Overall_Survival_Status'] == 'DECEASED' else 0, axis=1)
        kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']),  label= group)
        kmf.plot_survival_function(color = palette_list[i], ax=ax)
    except: pass
plt.show()
In [ ]: